Preparations

Load libraries and functions

# Load clustering (agnes) and dendrogram-manipulation utilities.
library(cluster)
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.7.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
source("functions.R")
## Loading required package: ggplot2

Corpus description and selection

Load data

# The frequency table was originally built with stylo and cached as CSV:
# data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/tokenized/boudams/", features = "w", ngram.size = 1, preserve.case = FALSE)
# data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# write.csv(as.matrix(data), "data/transkr_expanded_words.csv")
# Reload the cached table; after t(), rows = word features, columns = texts.
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))

Text lengths

# Estimated length (in tokens) of each text: column sums of the
# word-frequency matrix.
nwords <- colSums(data)
summary(nwords)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     298    2244    3539    5070    6774   18971
# Draw the length boxplot and report its outliers.  The original called
# boxplot(nwords) twice, drawing the same plot two times; a single call
# both draws the plot and returns the outlier values via $out.
boxplot(nwords)$out

## 05_Ano_Leg-A_Ap_NA_Vie_Jacques  29_Wau_Leg-C_Co_Ev_Vie_Martin 
##                          17920                          14432 
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial 
##                          18971                          15255
head(sort(nwords), n = 15)
##          03_Ano_Leg-A_Ap_NA_Mar_Jean          62_Ano_Leg-N_NA_NA_NA_Index 
##                                  298                                  301 
##       61_Ano_Leg-B_NA_NA_NA_Jugement       30_Wau_Leg-C_Co_Ev_Tra_Martin2 
##                                  406                                  722 
##      08_Ano_Leg-A_Ap_NA_Vie_Philippe     59_Ano_Leg-C_Vi_NA_Vie_Euphrasie 
##                                 1014                                 1293 
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur         32_Wau_Leg-C_Co_Ev_Vie_Brice 
##                                 1356                                 1385 
##    60_Ano_Leg-B_NA_NA_NA_Antechriste       54_Ano_Leg-C_Vi_NA_Vie_Pelagie 
##                                 1485                                 1506 
##      20_Ano_Leg-B_Ma_Fe_Vie_Felicite          11_Ano_Leg-A_Ap_NA_Vie_Marc 
##                                 1676                                 1820 
##         23_Ano_Leg-B_Ma_Ho_Vie_Sixte    53_Ano_Leg-C_Vi_NA_Vie_Marguerite 
##                                 1894                                 1935 
##       35_Wau_Leg-C_Co_Ev_Vie_Nicolas 
##                                 1960
# Keep only texts longer than 1000 words, and drop the Bestiaire.
toKeep <- colnames(data)[nwords > 1000]
toKeep <- toKeep[!grepl("Bestiaire", toKeep)]

# Violin + boxplot of the distribution of text lengths.
df <- as.data.frame(nwords)

ggplot(df, aes(x = "", y = nwords)) +
  geom_violin() +
  geom_boxplot(width = 0.3) +
  theme(axis.text.y = element_text(size = rel(1.4)),
        axis.title = element_text(size = rel(1.4))) +
  xlab("Est. length in words of corpus texts") +
  scale_y_continuous(breaks = c(0, 2500, 5000, 7500, 10000, 12500, 15000, 17500))

Transkribus raw data

3-grams from raw data

Load data

# Character 3-grams were originally extracted with stylo and cached:
# data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# write.csv(as.matrix(data), "data/transkr_raw_char3grams.csv")
# Reload the cached table; rows = 3-gram features, columns = texts.
data <- t(read.csv("data/transkr_raw_char3grams.csv", header = TRUE, row.names = 1))
# Restrict to the selected texts and drop features absent from all of them.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]

Burrows + vector-length norm

d = data
# Selection based on Moisl 2011
# (column 4 of the selection output is the kept-feature index/mask)
select = selection(d, z = 1.645)
select = select[,4]
# Normalisations
d = relativeFreqs(d)
# save data for robustness checks
# NOTE(review): this copy is taken BEFORE subsetting to the selected
# features, whereas the later saves (WordsSave, AffixesSave, POS3grSave,
# LemmasSave) happen AFTER subsetting — confirm the asymmetry is intended.
Raw3grSave = d
d = d[select,]
d = normalisations(d)
# Ward clustering on Manhattan distances between texts (t() puts texts in rows).
myCAH = cluster::agnes(t(d), metric = "manhattan", method="ward")
# Save
CAHRaw3gr = myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams = cahPlotCol(myCAH, k = 9, main = "Characters 3-grams from raw data (Transkr)")

# Same analysis after SOM-based reduction.
somCAH = somCluster(d)
somCAHRaw3gr = somCAH
somplotRaw3grams = cahPlotCol(somCAH, k = 9, main = "SOM BASED - Characters 3-grams from raw data (Transkr)")

Transkribus expanded data

Load data

# Reload the cached expanded-word table; rows = words, columns = texts.
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))
# Restrict to the selected texts and drop all-zero features.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]

Forms from expanded data

Burrows + vector-length norm

# Feature selection (Moisl 2011) at z = 1.645: column 4 of the
# selection output is the kept-feature index/mask.
m <- data
sel <- selection(m, z = 1.645)[, 4]
# Relative frequencies, then restrict to the selected features.
m <- relativeFreqs(m)
m <- m[sel, ]
# Copy kept for the robustness checks / combined analyses further down.
WordsSave <- m
m <- normalisations(m)
# Ward clustering on Manhattan distances between texts.
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHForms <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotForms <- cahPlotCol(myCAH, k = 9, main = "Expanded word forms (Transkr/Boudams/Pie)")

# Same pipeline after SOM-based reduction.
somCAH <- somCluster(m)
somCAHForms <- somCAH
somplotForms <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)")

Affixes from expanded data

# Build the affix frequency table from all word forms.
dataAffs <- countAffixes(data)

Burrows + vector-length norm

# Feature selection (Moisl 2011) on the affix counts.
m <- dataAffs
sel <- selection(m, z = 1.645)[, 4]
m <- relativeFreqs(m)
m <- m[sel, ]
# Copy kept for the robustness checks / combined analyses.
AffixesSave <- m
m <- normalisations(m)
# Ward clustering on Manhattan distances between texts.
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHAffs <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes <- cahPlotCol(myCAH, k = 9, main = "Expanded affixes (Transkr/Boudams/Pie)")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHAffs <- somCAH
somplotAffixes <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)")

Unstandardised function words from expanded data

Create function words list

#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# With or without pronouns?
functionWords <- source("functionWords.R")$value

Burrows + vector-length norm

# No Moisl selection here: the hand-picked function-word list is applied
# directly to the relative frequencies.
m <- relativeFreqs(data)
m <- m[functionWords, ]
# Copy kept for the robustness checks.
FWSave <- m
m <- normalisations(m)
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHFW <- myCAH
# barplot(sort(myCAH$height))
plotFW <- cahPlotCol(myCAH, k = 8, main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHFW <- somCAH
somplotFW <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")

Transkribus with linguistic annotation

POS 3-grams

# POS 3-gram counts, exported with ';' separator; the first column is the
# total frequency and is dropped.
data <- read.csv("data/transkr_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
data <- data[, -1]
# Undo read.csv's column-name mangling ("X" prefix, "-" turned into ".").
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: match the literal strings — the original patterns used an
# unescaped regex "." that would match ANY character in that position.
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected texts, drop all-zero features, go to matrix form.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)

Burrows + vector-length norm

# Feature selection (Moisl 2011) on the POS 3-gram counts.
m <- data
sel <- selection(m, z = 1.645)[, 4]
m <- relativeFreqs(m)
m <- m[sel, ]
# Copy kept for the robustness checks / combined analyses.
POS3grSave <- m
m <- normalisations(m)
# Ward clustering on Manhattan distances between texts.
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHPOS3gr <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams <- cahPlotCol(myCAH, k = 9, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHPOS3gr <- somCAH
somplotPOS3grams <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")

Lemmas

# Lemma counts, exported with ';' separator; the first column is the
# total frequency and is dropped.
data <- read.csv("data/transkr_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
data <- data[, -1]
# Undo read.csv's column-name mangling ("X" prefix, "-" turned into ".").
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: match the literal strings — the original patterns used an
# unescaped regex "." that would match ANY character in that position.
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Restrict to the selected texts, drop all-zero features, go to matrix form.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)

Burrows + vector-length norm

# Feature selection (Moisl 2011) on the lemma counts.
m <- data
sel <- selection(m, z = 1.645)[, 4]
m <- relativeFreqs(m)
m <- m[sel, ]
# Copy kept for the robustness checks / combined analyses.
LemmasSave <- m
m <- normalisations(m)
# Ward clustering on Manhattan distances between texts.
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHLemmas <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas <- cahPlotCol(myCAH, k = 9, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHLemmas <- somCAH
somplotLemmas <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")

Function words from lemmas

# Find function words
#rownames(data)[1:250]
functionLemmas <- source("functionLemmas.R")$value

Burrows + vector-length norm

# Function lemmas applied directly to relative frequencies (no Moisl
# selection, as for the unstandardised function words above).
m <- relativeFreqs(data)
m <- m[functionLemmas, ]
# Copy kept for the combined analyses.
FLSave <- m
m <- normalisations(m)
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHFL <- myCAH
# barplot(sort(myCAH$height))
plotFL <- cahPlotCol(myCAH, k = 8, main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHFL <- somCAH
somplotFL <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words (lemmas)")

Affixes + POS 3-gr + Function words (lemmas)

# Stack the three saved feature sets into one combined matrix.
data <- rbind(AffixesSave, POS3grSave, FLSave)
m <- normalisations(data)
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHGlob <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob <- cahPlotCol(myCAH, k = 9, main = "Affixes + POS 3- grams + Function words (lemmas)")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHGlob <- somCAH
somplotGlob <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)")

Expanded word forms + lemmas

# Combine the saved lemma and word-form feature sets.
data <- rbind(LemmasSave, WordsSave)
m <- normalisations(data)
myCAH <- cluster::agnes(t(m), metric = "manhattan", method = "ward")
CAHWordsLemmas <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotWordsLemmas <- cahPlotCol(myCAH, k = 9, main = "Word forms + lemmas")
# SOM-based variant.
somCAH <- somCluster(m)
somCAHWordsLemmas <- somCAH
somplotWordsLemmas <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Word forms + lemmas")

Plots

Reference results on the three feature sets

gridExtra::grid.arrange(plotRaw3grams, somplotRaw3grams, plotGlob, somplotGlob, plotWordsLemmas, somplotWordsLemmas, ncol = 2, left = "Without SOM", right = "With SOM")

Analyses

#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)
# All eight dendrograms (without SOM), two per row.
gridExtra::grid.arrange(plotRaw3grams, plotForms, plotAffixes, plotFW, plotLemmas, plotFL, plotPOS3grams, plotGlob, ncol = 2)

gridExtra::grid.arrange(somplotRaw3grams, somplotForms, somplotAffixes, somplotFW, somplotLemmas, somplotFL, somplotPOS3grams, somplotGlob, ncol = 2)

Robustness

# All hierarchical clusterings, labelled, for the robustness benchmark.
cahList <- list(
  raw3grams = CAHRaw3gr,
  Affs = CAHAffs,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  FLPOSandAffs = CAHGlob,
  Forms = CAHForms,
  Lemmas = CAHLemmas,
  WordsLemmas = CAHWordsLemmas,
  UnnormFW = CAHFW
)
#compareHC(cahList, k = 9)

# Benchmark every clustering against the raw 3-gram reference at k = 9.
benchmark <- benchmarkHC(CAHRaw3gr, cahList, k = 9)
round(benchmark, digits = 2)
##                 N CPWauchier   AC CPREF
## raw3grams    1276       0.98 0.63  1.00
## Affs          774       0.98 0.65  0.86
## FunctLemm     100       1.00 0.69  0.73
## POS3gr        328       0.97 0.68  0.68
## FLPOSandAffs 1202       0.98 0.62  0.83
## Forms         698       0.98 0.63  0.81
## Lemmas        512       0.97 0.59  0.73
## WordsLemmas  1210       0.98 0.62  0.85
## UnnormFW      171       0.98 0.72  0.81
# Now with SOM
cahSOMList <- list(
  raw3grams = somCAHRaw3gr,
  Affs = somCAHAffs,
  FunctLemm = somCAHFL,
  POS3gr = somCAHPOS3gr,
  FLPOSandAffs = somCAHGlob,
  Forms = somCAHForms,
  Lemmas = somCAHLemmas,
  WordsLemmas = somCAHWordsLemmas,
  UnnormFW = somCAHFW
)

# Benchmark the SOM-based clusterings against the same reference.
benchmark <- benchmarkHC(CAHRaw3gr, cahSOMList, k = 9)
round(benchmark, digits = 2)
##                N CPWauchier   AC CPREF
## raw3grams    100       0.98 0.79  0.97
## Affs         100       0.97 0.79  0.86
## FunctLemm    100       0.97 0.84  0.63
## POS3gr       100       1.00 0.79  0.68
## FLPOSandAffs 100       0.97 0.76  0.90
## Forms        100       0.98 0.79  0.80
## Lemmas       100       0.92 0.75  0.66
## WordsLemmas  100       0.97 0.76  0.73
## UnnormFW     100       0.98 0.84  0.75

Volatility index

#TODO: add SOMS ?
# Per-text volatility index across the nine clusterings.
vol <- volatility(cahList, k = 9)

# Attach the text lengths by row name (texts missing from nwords get NA).
out <- merge(round(vol, digits = 2), nwords, by = "row.names",
             all.x = TRUE, all.y = FALSE)

# Sort by volatility index.
out[order(out[, "V_i"]), ]
##                                 Row.names   V_i     y
## 44          45_Ano_Leg-C_Ap_NA_Pas_Andre2 -0.49 13315
## 28         28_Ano_Leg-B_Ma_Ho_Vie_Clement -0.48  2544
## 5          05_Ano_Leg-A_Ap_NA_Vie_Jacques -0.46 17920
## 11            11_Ano_Leg-A_Ap_NA_Vie_Marc -0.43  1820
## 40           41_Ano_Leg-C_Vi_NA_Vie_Irene -0.42  3145
## 59      60_Ano_Leg-B_NA_NA_NA_Antechriste -0.42  1485
## 42       43_Ano_Leg-C_Vi_NA_Vie_Catherine -0.29  8877
## 43           44_Ano_Leg-C_Ap_NA_Vie_Andre -0.29  3118
## 8         08_Ano_Leg-A_Ap_NA_Vie_Philippe -0.28  1014
## 9    09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur -0.28  1356
## 26         26_Ano_Leg-B_Ma_Ev_Vie_Lambert -0.27  5247
## 45         46_Ano_Leg-B_Co_NA_Pur_Patrice -0.26  7872
## 46      47_Ano_Leg-C_Co_er_Vie_PaulErmite -0.26  3753
## 41       42_Ano_Leg-B_Vi_NA_Ass_NotreDame -0.25  3119
## 27       27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon -0.23  6565
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne -0.23  5529
## 13       13_Ano_Leg-B_Ma_Ho_Vie_Sebastien -0.22  3539
## 24         24_Ano_Leg-B_Ma_Ho_Vie_Laurent -0.21  3243
## 25       25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte -0.21  2513
## 23           23_Ano_Leg-B_Ma_Ho_Vie_Sixte -0.20  1894
## 14         14_Ano_Leg-B_Ma_Ho_Vie_Vincent -0.16  4838
## 19           19_Ano_Leg-B_Ma_Fe_Vie_Agnes -0.16  4177
## 50        51_Ano_Leg-C_Ma_ho_Vie_Eustache -0.14  3099
## 52      53_Ano_Leg-C_Vi_NA_Vie_Marguerite -0.11  1935
## 53         54_Ano_Leg-C_Vi_NA_Vie_Pelagie -0.11  1506
## 56          57_Ano_Leg-C_Vi_NA_Vie_Julien -0.06  2766
## 58       59_Ano_Leg-C_Vi_NA_Vie_Euphrasie -0.06  1293
## 55        56_Ano_Leg-C_Co_NA_Vie_Mamertin -0.05  2202
## 49         50_Ano_Leg-C_NA_NA_Vie_Placide -0.02  2783
## 31           32_Wau_Leg-C_Co_Ev_Vie_Brice  0.00  1385
## 54          55_Ano_Leg-C_Co_NA_Vie_Simeon  0.00  2894
## 12          12_Ano_Leg-A_Ma_Ho_Vie_Longin  0.06  2244
## 22          22_Ano_Leg-B_Ma_Fe_Vie_Cecile  0.07  6782
## 47         48_Ano_Leg-C_Co_ev_Tra_Benoit2  0.07  3234
## 48            49_Ano_Leg-C_NA_NA_Vie_Maur  0.07  6310
## 51           52_Ano_Leg-C_Co_NA_Vie_Fursi  0.07  2492
## 20        20_Ano_Leg-B_Ma_Fe_Vie_Felicite  0.08  1676
## 6         06_Ano_Leg-A_Ap_NA_Vie_Matthieu  0.13  6447
## 7        07_Ano_Leg-A_Ap_NA_Vie_SimonJude  0.13  6784
## 10      10_Ano_Leg-A_Ap_NA_Vie_Barthelemy  0.13  4360
## 37          38_Wau_Leg-C_Co_Ev_Vie_Jerome  0.14  2425
## 16      16_Ano_Leg-B_Ma_Ho_Vie_Christophe  0.18  9122
## 17          17_Ano_Leg-B_Ma_Fe_Vie_Agathe  0.18  3109
## 18            18_Ano_Leg-B_Ma_Fe_Vie_Luce  0.18  2366
## 21       21_Ano_Leg-B_Ma_Fe_Vie_Christine  0.18  7481
## 1          00_Ano_Leg-A_Ap_Ev_Dis_Pierre1  0.19  6774
## 2          01_Ano_Leg-A_Ap_NA_Vie_Pierre2  0.19  5527
## 3             02_Ano_Leg-A_Ap_NA_Pas_Paul  0.19  4798
## 4          04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev  0.19  4955
## 15         15_Ano_Leg-B_Ma_Ho_Vie_Georges  0.20  4548
## 33         34_Wau_Leg-C_Co_Ev_Vie_Martial  0.44 15255
## 35        36_Wau_Leg-C_Co_Ev_Mir_Nicolas2  0.44 10473
## 36        37_Wau_Leg-C_Co_Ev_Tra_Nicolas3  0.44  8379
## 29          29_Wau_Leg-C_Co_Ev_Vie_Martin  0.59 14432
## 30         31_Wau_Leg-C_Co_Ev_Dia_Martin3  0.59 18971
## 34         35_Wau_Leg-C_Co_Ev_Vie_Nicolas  0.67  1960
## 39          40_Wau_Leg-C_Co_Er_Vie_Alexis  0.67  4103
## 32          33_Wau_Leg-C_Co_Er_Vie_Gilles  0.74  4415
## 38          39_Wau_Leg-C_Co_Ev_Vie_Benoit  0.74 12792
# see if there is a correlation
# (regress text length, column 3, on the volatility index, column 2;
# the formula text is kept as-is so the printed Call matches the output)
reg = lm(out[, 3] ~ out[, 2])
summary(reg)
## 
## Call:
## lm(formula = out[, 3] ~ out[, 2])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5974  -2715  -1025   1244  14641 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5173.8      520.6   9.939 4.71e-14 ***
## out[, 2]      4119.6     1648.0   2.500   0.0153 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3994 on 57 degrees of freedom
## Multiple R-squared:  0.09879,    Adjusted R-squared:  0.08298 
## F-statistic: 6.248 on 1 and 57 DF,  p-value: 0.01533
# Scatter of volatility index vs text length, with the regression line.
plot(out[, 2], out[, 3])
abline(reg)

# And the distribution of the volatility indices
boxplot(out[, 2])

hist(out[, 2])

# ONLY on the three reference analyses
refCAH <- list(
  raw3grams = CAHRaw3gr,
  FLPOSandAffs = CAHGlob,
  WordsLemmas = CAHWordsLemmas
)

# Per-text volatility index across these three clusterings.
vol <- volatility(refCAH, k = 9)

# Attach the text lengths by row name and sort by volatility index.
out <- merge(round(vol, digits = 2), nwords, by = "row.names",
             all.x = TRUE, all.y = FALSE)

out[order(out[, "V_i"]), ]
##                                 Row.names   V_i     y
## 5          05_Ano_Leg-A_Ap_NA_Vie_Jacques -0.21 17920
## 44          45_Ano_Leg-C_Ap_NA_Pas_Andre2 -0.13 13315
## 41       42_Ano_Leg-B_Vi_NA_Ass_NotreDame -0.08  3119
## 57 58_Ano_Leg-C_Vi_NA_Vie_MarieEgyptienne -0.08  5529
## 59      60_Ano_Leg-B_NA_NA_NA_Antechriste -0.04  1485
## 28         28_Ano_Leg-B_Ma_Ho_Vie_Clement  0.00  2544
## 40           41_Ano_Leg-C_Vi_NA_Vie_Irene  0.00  3145
## 11            11_Ano_Leg-A_Ap_NA_Vie_Marc  0.03  1820
## 42       43_Ano_Leg-C_Vi_NA_Vie_Catherine  0.11  8877
## 43           44_Ano_Leg-C_Ap_NA_Vie_Andre  0.11  3118
## 23           23_Ano_Leg-B_Ma_Ho_Vie_Sixte  0.17  1894
## 24         24_Ano_Leg-B_Ma_Ho_Vie_Laurent  0.17  3243
## 25       25_Ano_Leg-B_Ma_Ho_Vie_Hippolyte  0.17  2513
## 56          57_Ano_Leg-C_Vi_NA_Vie_Julien  0.22  2766
## 45         46_Ano_Leg-B_Co_NA_Pur_Patrice  0.24  7872
## 46      47_Ano_Leg-C_Co_er_Vie_PaulErmite  0.24  3753
## 1          00_Ano_Leg-A_Ap_Ev_Dis_Pierre1  0.33  6774
## 2          01_Ano_Leg-A_Ap_NA_Vie_Pierre2  0.33  5527
## 3             02_Ano_Leg-A_Ap_NA_Pas_Paul  0.33  4798
## 4          04_Ano_Leg-A_Ap_NA_Vie_Jean_Ev  0.33  4955
## 12          12_Ano_Leg-A_Ma_Ho_Vie_Longin  0.33  2244
## 15         15_Ano_Leg-B_Ma_Ho_Vie_Georges  0.33  4548
## 20        20_Ano_Leg-B_Ma_Fe_Vie_Felicite  0.33  1676
## 27       27_Ano_Leg-B_Ma_Ho_Vie_Pantaleon  0.33  6565
## 47         48_Ano_Leg-C_Co_ev_Tra_Benoit2  0.50  3234
## 48            49_Ano_Leg-C_NA_NA_Vie_Maur  0.50  6310
## 49         50_Ano_Leg-C_NA_NA_Vie_Placide  0.50  2783
## 50        51_Ano_Leg-C_Ma_ho_Vie_Eustache  0.50  3099
## 51           52_Ano_Leg-C_Co_NA_Vie_Fursi  0.50  2492
## 54          55_Ano_Leg-C_Co_NA_Vie_Simeon  0.50  2894
## 31           32_Wau_Leg-C_Co_Ev_Vie_Brice  0.56  1385
## 34         35_Wau_Leg-C_Co_Ev_Vie_Nicolas  0.56  1960
## 37          38_Wau_Leg-C_Co_Ev_Vie_Jerome  0.56  2425
## 39          40_Wau_Leg-C_Co_Er_Vie_Alexis  0.56  4103
## 52      53_Ano_Leg-C_Vi_NA_Vie_Marguerite  0.67  1935
## 53         54_Ano_Leg-C_Vi_NA_Vie_Pelagie  0.67  1506
## 55        56_Ano_Leg-C_Co_NA_Vie_Mamertin  0.67  2202
## 58       59_Ano_Leg-C_Vi_NA_Vie_Euphrasie  0.67  1293
## 6         06_Ano_Leg-A_Ap_NA_Vie_Matthieu  0.71  6447
## 7        07_Ano_Leg-A_Ap_NA_Vie_SimonJude  0.71  6784
## 8         08_Ano_Leg-A_Ap_NA_Vie_Philippe  0.71  1014
## 9    09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur  0.71  1356
## 10      10_Ano_Leg-A_Ap_NA_Vie_Barthelemy  0.71  4360
## 13       13_Ano_Leg-B_Ma_Ho_Vie_Sebastien  0.78  3539
## 14         14_Ano_Leg-B_Ma_Ho_Vie_Vincent  0.78  4838
## 16      16_Ano_Leg-B_Ma_Ho_Vie_Christophe  0.78  9122
## 17          17_Ano_Leg-B_Ma_Fe_Vie_Agathe  0.78  3109
## 18            18_Ano_Leg-B_Ma_Fe_Vie_Luce  0.78  2366
## 19           19_Ano_Leg-B_Ma_Fe_Vie_Agnes  0.78  4177
## 21       21_Ano_Leg-B_Ma_Fe_Vie_Christine  0.78  7481
## 22          22_Ano_Leg-B_Ma_Fe_Vie_Cecile  0.78  6782
## 26         26_Ano_Leg-B_Ma_Ev_Vie_Lambert  0.78  5247
## 29          29_Wau_Leg-C_Co_Ev_Vie_Martin  0.78 14432
## 30         31_Wau_Leg-C_Co_Ev_Dia_Martin3  0.78 18971
## 32          33_Wau_Leg-C_Co_Er_Vie_Gilles  0.78  4415
## 33         34_Wau_Leg-C_Co_Ev_Vie_Martial  0.78 15255
## 35        36_Wau_Leg-C_Co_Ev_Mir_Nicolas2  0.78 10473
## 36        37_Wau_Leg-C_Co_Ev_Tra_Nicolas3  0.78  8379
## 38          39_Wau_Leg-C_Co_Ev_Vie_Benoit  0.78 12792
# see if there is a correlation
# (regress text length, column 3, on the volatility index, column 2;
# the formula text is kept as-is so the printed Call matches the output)
reg = lm(out[, 3] ~ out[, 2])
summary(reg)
## 
## Call:
## lm(formula = out[, 3] ~ out[, 2])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4468  -2817  -1376   1266  13418 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4767       1017   4.687 1.77e-05 ***
## out[, 2]        1007       1840   0.547    0.586    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4196 on 57 degrees of freedom
## Multiple R-squared:  0.005227,   Adjusted R-squared:  -0.01223 
## F-statistic: 0.2995 on 1 and 57 DF,  p-value: 0.5863
# Scatter of volatility index vs text length, with the regression line.
plot(out[, 2], out[, 3])
abline(reg)

# And the distribution of the volatility indices
boxplot(out[, 2])

hist(out[, 2])